Hotel booking demand 데이터를 이용하여 EDA 진행
# 라이브러리 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pi
import seaborn as sns
# 데이터 불러오기
hb_df = pd.read_csv("./csv/hotel_bookings.csv")
🤔
hb_df.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
5 rows × 32 columns
hb_df.dtypes
hotel object is_canceled int64 lead_time int64 arrival_date_year int64 arrival_date_month object arrival_date_week_number int64 arrival_date_day_of_month int64 stays_in_weekend_nights int64 stays_in_week_nights int64 adults int64 children float64 babies int64 meal object country object market_segment object distribution_channel object is_repeated_guest int64 previous_cancellations int64 previous_bookings_not_canceled int64 reserved_room_type object assigned_room_type object booking_changes int64 deposit_type object agent float64 company float64 days_in_waiting_list int64 customer_type object adr float64 required_car_parking_spaces int64 total_of_special_requests int64 reservation_status object reservation_status_date object dtype: object
💡 의미가 없거나 값이 너무 많이 비어있는 컬럼은 제외
hb_df.describe()
| is_canceled | lead_time | arrival_date_year | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | booking_changes | agent | company | days_in_waiting_list | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119386.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 103050.000000 | 6797.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 |
| mean | 0.370416 | 104.011416 | 2016.156554 | 27.165173 | 15.798241 | 0.927599 | 2.500302 | 1.856403 | 0.103890 | 0.007949 | 0.031912 | 0.087118 | 0.137097 | 0.221124 | 86.693382 | 189.266735 | 2.321149 | 101.831122 | 0.062518 | 0.571363 |
| std | 0.482918 | 106.863097 | 0.707476 | 13.605138 | 8.780829 | 0.998613 | 1.908286 | 0.579261 | 0.398561 | 0.097436 | 0.175767 | 0.844336 | 1.497437 | 0.652306 | 110.774548 | 131.655015 | 17.594721 | 50.535790 | 0.245291 | 0.792798 |
| min | 0.000000 | 0.000000 | 2015.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 6.000000 | 0.000000 | -6.380000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 18.000000 | 2016.000000 | 16.000000 | 8.000000 | 0.000000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 62.000000 | 0.000000 | 69.290000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 69.000000 | 2016.000000 | 28.000000 | 16.000000 | 1.000000 | 2.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 179.000000 | 0.000000 | 94.575000 | 0.000000 | 0.000000 |
| 75% | 1.000000 | 160.000000 | 2017.000000 | 38.000000 | 23.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 229.000000 | 270.000000 | 0.000000 | 126.000000 | 0.000000 | 1.000000 |
| max | 1.000000 | 737.000000 | 2017.000000 | 53.000000 | 31.000000 | 19.000000 | 50.000000 | 55.000000 | 10.000000 | 10.000000 | 1.000000 | 26.000000 | 72.000000 | 21.000000 | 535.000000 | 543.000000 | 391.000000 | 5400.000000 | 8.000000 | 5.000000 |
# 결측치 확인
hb_df.isnull().sum()
hotel 0 is_canceled 0 lead_time 0 arrival_date_year 0 arrival_date_month 0 arrival_date_week_number 0 arrival_date_day_of_month 0 stays_in_weekend_nights 0 stays_in_week_nights 0 adults 0 children 4 babies 0 meal 0 country 488 market_segment 0 distribution_channel 0 is_repeated_guest 0 previous_cancellations 0 previous_bookings_not_canceled 0 reserved_room_type 0 assigned_room_type 0 booking_changes 0 deposit_type 0 agent 16340 company 112593 days_in_waiting_list 0 customer_type 0 adr 0 required_car_parking_spaces 0 total_of_special_requests 0 reservation_status 0 reservation_status_date 0 dtype: int64
👉 chiledren, country, agent, company 컬럼을 제외하고는 결측치가 없다.
children의 결측치는 4개 밖에 없기때문에 0으로 바꾸어준다.
hb_df['children'] = hb_df['children'].fillna(0)
hb_df['children'].isnull().sum()
0
hb_df.corr()
| is_canceled | lead_time | arrival_date_year | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | booking_changes | agent | company | days_in_waiting_list | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| is_canceled | 1.000000 | 0.293123 | 0.016660 | 0.008148 | -0.006130 | -0.001791 | 0.024765 | 0.060017 | 0.005048 | -0.032491 | -0.084793 | 0.110133 | -0.057358 | -0.144381 | -0.083114 | -0.020642 | 0.054186 | 0.047557 | -0.195498 | -0.234658 |
| lead_time | 0.293123 | 1.000000 | 0.040142 | 0.126871 | 0.002268 | 0.085671 | 0.165799 | 0.119519 | -0.037622 | -0.020915 | -0.124410 | 0.086042 | -0.073548 | 0.000149 | -0.069741 | 0.151464 | 0.170084 | -0.063077 | -0.116451 | -0.095712 |
| arrival_date_year | 0.016660 | 0.040142 | 1.000000 | -0.540561 | -0.000221 | 0.021497 | 0.030883 | 0.029635 | 0.054624 | -0.013192 | 0.010341 | -0.119822 | 0.029218 | 0.030872 | 0.063457 | 0.259095 | -0.056497 | 0.197580 | -0.013684 | 0.108531 |
| arrival_date_week_number | 0.008148 | 0.126871 | -0.540561 | 1.000000 | 0.066809 | 0.018208 | 0.015558 | 0.025909 | 0.005518 | 0.010395 | -0.030131 | 0.035501 | -0.020904 | 0.005508 | -0.031201 | -0.076760 | 0.022933 | 0.075791 | 0.001920 | 0.026149 |
| arrival_date_day_of_month | -0.006130 | 0.002268 | -0.000221 | 0.066809 | 1.000000 | -0.016354 | -0.028174 | -0.001566 | 0.014544 | -0.000230 | -0.006145 | -0.027011 | -0.000300 | 0.010613 | 0.001487 | 0.044858 | 0.022728 | 0.030245 | 0.008683 | 0.003062 |
| stays_in_weekend_nights | -0.001791 | 0.085671 | 0.021497 | 0.018208 | -0.016354 | 1.000000 | 0.498969 | 0.091871 | 0.045793 | 0.018483 | -0.087239 | -0.012775 | -0.042715 | 0.063281 | 0.140739 | 0.066749 | -0.054151 | 0.049342 | -0.018554 | 0.072671 |
| stays_in_week_nights | 0.024765 | 0.165799 | 0.030883 | 0.015558 | -0.028174 | 0.498969 | 1.000000 | 0.092976 | 0.044203 | 0.020191 | -0.097245 | -0.013992 | -0.048743 | 0.096209 | 0.182382 | 0.182211 | -0.002020 | 0.065237 | -0.024859 | 0.068192 |
| adults | 0.060017 | 0.119519 | 0.029635 | 0.025909 | -0.001566 | 0.091871 | 0.092976 | 1.000000 | 0.030447 | 0.018146 | -0.146426 | -0.006738 | -0.107983 | -0.051673 | -0.035594 | 0.207793 | -0.008283 | 0.230641 | 0.014785 | 0.122884 |
| children | 0.005048 | -0.037622 | 0.054624 | 0.005518 | 0.014544 | 0.045793 | 0.044203 | 0.030447 | 1.000000 | 0.024030 | -0.032859 | -0.024730 | -0.021072 | 0.048949 | 0.041066 | 0.030931 | -0.033273 | 0.324854 | 0.056253 | 0.081745 |
| babies | -0.032491 | -0.020915 | -0.013192 | 0.010395 | -0.000230 | 0.018483 | 0.020191 | 0.018146 | 0.024030 | 1.000000 | -0.008943 | -0.007501 | -0.006550 | 0.083440 | 0.036184 | 0.019206 | -0.010621 | 0.029186 | 0.037383 | 0.097889 |
| is_repeated_guest | -0.084793 | -0.124410 | 0.010341 | -0.030131 | -0.006145 | -0.087239 | -0.097245 | -0.146426 | -0.032859 | -0.008943 | 1.000000 | 0.082293 | 0.418056 | 0.012092 | 0.031527 | -0.244586 | -0.022235 | -0.134314 | 0.077090 | 0.013050 |
| previous_cancellations | 0.110133 | 0.086042 | -0.119822 | 0.035501 | -0.027011 | -0.012775 | -0.013992 | -0.006738 | -0.024730 | -0.007501 | 0.082293 | 1.000000 | 0.152728 | -0.026993 | -0.012488 | -0.184574 | 0.005929 | -0.065646 | -0.018492 | -0.048384 |
| previous_bookings_not_canceled | -0.057358 | -0.073548 | 0.029218 | -0.020904 | -0.000300 | -0.042715 | -0.048743 | -0.107983 | -0.021072 | -0.006550 | 0.418056 | 0.152728 | 1.000000 | 0.011608 | 0.023252 | -0.208557 | -0.009397 | -0.072144 | 0.047653 | 0.037824 |
| booking_changes | -0.144381 | 0.000149 | 0.030872 | 0.005508 | 0.010613 | 0.063281 | 0.096209 | -0.051673 | 0.048949 | 0.083440 | 0.012092 | -0.026993 | 0.011608 | 1.000000 | 0.067010 | 0.122098 | -0.011634 | 0.019618 | 0.065620 | 0.052833 |
| agent | -0.083114 | -0.069741 | 0.063457 | -0.031201 | 0.001487 | 0.140739 | 0.182382 | -0.035594 | 0.041066 | 0.036184 | 0.031527 | -0.012488 | 0.023252 | 0.067010 | 1.000000 | 0.350746 | -0.055151 | -0.024695 | 0.177353 | 0.034162 |
| company | -0.020642 | 0.151464 | 0.259095 | -0.076760 | 0.044858 | 0.066749 | 0.182211 | 0.207793 | 0.030931 | 0.019206 | -0.244586 | -0.184574 | -0.208557 | 0.122098 | 0.350746 | 1.000000 | 0.000411 | 0.086376 | -0.012916 | -0.098558 |
| days_in_waiting_list | 0.054186 | 0.170084 | -0.056497 | 0.022933 | 0.022728 | -0.054151 | -0.002020 | -0.008283 | -0.033273 | -0.010621 | -0.022235 | 0.005929 | -0.009397 | -0.011634 | -0.055151 | 0.000411 | 1.000000 | -0.040756 | -0.030600 | -0.082730 |
| adr | 0.047557 | -0.063077 | 0.197580 | 0.075791 | 0.030245 | 0.049342 | 0.065237 | 0.230641 | 0.324854 | 0.029186 | -0.134314 | -0.065646 | -0.072144 | 0.019618 | -0.024695 | 0.086376 | -0.040756 | 1.000000 | 0.056628 | 0.172185 |
| required_car_parking_spaces | -0.195498 | -0.116451 | -0.013684 | 0.001920 | 0.008683 | -0.018554 | -0.024859 | 0.014785 | 0.056253 | 0.037383 | 0.077090 | -0.018492 | 0.047653 | 0.065620 | 0.177353 | -0.012916 | -0.030600 | 0.056628 | 1.000000 | 0.082626 |
| total_of_special_requests | -0.234658 | -0.095712 | 0.108531 | 0.026149 | 0.003062 | 0.072671 | 0.068192 | 0.122884 | 0.081745 | 0.097889 | 0.013050 | -0.048384 | 0.037824 | 0.052833 | 0.034162 | -0.098558 | -0.082730 | 0.172185 | 0.082626 | 1.000000 |
👉 데이터 살펴보기 (is_canceled 컬럼과의 관계)
hb_df['is_canceled'].value_counts()
0 75166 1 44224 Name: is_canceled, dtype: int64
fig = px.histogram(hb_df, x="is_canceled", color='hotel')
fig.update_xaxes(type='category')
fig.show()
# pi.write_json(fig, 'chart1.json')
👉 취소하지 않은 고객의 데이터가 더 많다. 그리고 City Hotel의 데이터가 더 많다.
fig = px.histogram(hb_df, x="arrival_date_month")
fig.update_xaxes(type='category')
fig.show()
👉 여름에 예약하는 사람이 많고, 겨울에 예약하는 사람이 적다.
fig = px.pie(hb_df, names='reserved_room_type')
fig.show()
👉 예약된 객실 유형의 코드는 A, D, E, F, G 순으로 많다.
fig = px.histogram(hb_df, x="lead_time", y='is_canceled', nbins=100)
fig.show()
❌ 오래전에 예약한 사람보다 급하게 예약한 사람들이 취소하는 사람이 더 많다.
fig = px.histogram(hb_df, x="booking_changes", color='is_canceled')
fig.show()
⭕ 예약 변경 횟수가 적을수록 에약을 취소하는 사람이 많다.
fig = px.imshow(hb_df[['is_canceled','arrival_date_month']].groupby('arrival_date_month').mean())
fig.show()
👉 4월과 6월에 취소하는 사람이 많고, 1월과 11월에 취소하는 사람이 적다.